// Copyright (c) Meta Platforms, Inc. and affiliates.

#ifndef ZSTRONG_COMPRESS_TOKENIZE2TO1_H
#define ZSTRONG_COMPRESS_TOKENIZE2TO1_H

#if defined(__cplusplus)
extern "C" {
#endif

#include <stddef.h> // size_t
#include <stdint.h> // uintx

/* ZS_tokenize2to1_encode():
 * accepts as input an array of 2-bytes symbols (fixed size).
 * result:
 * @return : alphabetSize (nb of different 2-bytes symbols)
 * - dstAlphabet : the sorted list of symbols actually present in srcSymbols
 *                (typically favorable for a delta transform)
 *    + alphabetCapacity is expressed in nb of 2-bytes symbols
 *    + alphabetCapacity must be >= 256 to cover worst case situations
 * - within dstIndex : the list of indexes, using 1 byte per index.
 *    + There are necessarily nbSymbols indexes written into dstIndex
 *    + indexCapacity must be >= nbSymbols
 *
 * Conditions : the nb of different symbols (alphabetSize) MUST be <= 256
 *              all pointers are presumed valid and non NULL
 *
 * Note : this presumes a capability to know that alphabetSize <= 256
 *        not provided in this unit
 *
 * Note 2 : this function uses 64 KB of stack memory.
 *          To avoid this usage, invoke ZS_tokenize2to1_encode_wksp() below
 */

size_t ZS_tokenize2to1_encode(
        uint8_t* dstIndex,
        size_t indexCapacity,
        uint16_t* dstAlphabet,
        size_t alphabetCapacity,
        const uint16_t* srcSymbols,
        size_t nbSymbols);

/* ZS_tokenize2to1_encode_wksp():
 * same as ZS_tokenize2to1_encode(), but does not use stack space.
 * Instead, an already allocated workspace must be passed as argument.
 * @workspace must be valid and writable,
 * and have a size >= ZS_TOKENIZE2TO1_WORKSPACE_SIZE.
 * @workspace content is throw-away, it does not need to be preserved between
 * invocations.
 */
#define ZS_TOKENIZE2TO1_WORKSPACE_SIZE TOK2_CARDINALITY_MAX
size_t ZS_tokenize2to1_encode_wksp(
        void* workspace,
        uint8_t* dstIndex,
        size_t indexCapacity,
        uint16_t* dstAlphabet,
        size_t alphabetCapacity,
        const uint16_t* srcSymbols,
        size_t nbSymbols);

/* TOK2_cardinality() :
 * detect and count nb of alphabet symbols present.
 * @return: alphabetSize
 * note : the content of @present is write-only,
 *        to be then forwarded to TOK2_encode_into1().
 */
#define TOK2_CARDINALITY_MAX (256 * 256)
size_t TOK2_numSort_cardinality(
        uint8_t present[TOK2_CARDINALITY_MAX],
        const uint16_t* srcSymbols,
        size_t nbSymbols);

/* TOK2_encode_into1() :
 * encode 2-bytes symbols present in @srcSymbols
 * into an alphabet of unique symbols, written into @dstAlphabet,
 * and a list of indexes, written into @dstIndex.
 * note 1 : The caller must know alphabetSize before calling this method!
 *          It only works for alphabetSize <= 256 !
 *          and @alphabetCapacity must be >= alphabetSize.
 * note 2 : @srcSymbols must be properly aligned for 16-bit boundaries.
 *          it's interpreted as an integer using host endian.
 * note 3 : the generated alphabet is "ordered"
 *          when interpreted as an integer using host endian.
 * note 4 : @present should be the content generated by TOK2_cardinality().
 *          It will be altered during the encoding process.
 * note 5 : this function cannot fail.
 *          all its working conditions *must* be respected, aka
 *          all buffers must be valid and correctly sized.
 */
void TOK2_numSort_encode_into1(
        uint8_t* dstIndex,
        size_t indexCapacity,
        uint16_t* dstAlphabet,
        size_t alphabetCapacity,
        const uint16_t* srcSymbols,
        size_t nbSymbols,
        uint8_t* present);

/* TOK2_fsf_*():
 * These functions do exactly the same as above
 * but they are meant to work on serialized data,
 * as opposed to native numeric fields.
 * As a consequence, 16-bit input alignment is no longer required (odd addresses
 * allowed), and interpreting tokens is independent from host endianess.
 *
 * While this additional responsibility could be expected to make these variants
 * slower, in practice, on modern cpus compatible with unaligned memory
 * accesses and using little endian convention, with a modern powerful compiler,
 * performance is expected to be similar, if not identical, to numeric variant
 * above.
 *
 * The produced dictionary will be ordered when tokens are interpreted as
 * unsigned 16-bit values using little-endian convention.
 * This is neither in input occurrence order, nor in alphabetical order.
 * Consequently, this seems only suitable for a tokenize transform which
 * either doesn't offer any guarantee on dictionary order
 * or provides little endian ordering specifically.
 */
size_t TOK2_fsf_cardinality(
        uint8_t present[TOK2_CARDINALITY_MAX],
        const void* src2BSymbols,
        size_t nbSymbols);

void TOK2_fsf_encode_into1(
        uint8_t* dstIndex,
        size_t indexCapacity,
        void* dst2BAlphabet,
        size_t alphabetCapacity,
        const void* src2BSymbols,
        size_t nbSymbols,
        uint8_t* present);

#if defined(__cplusplus)
}
#endif

#endif // ZSTRONG_COMPRESS_TRANSFORMS_DELTA_ENCODE_H
